chunk options
CSS for scrollable output & Header colors
Turning scientific / Exponential numbers off
# A large positive `scipen` makes R print numbers in fixed notation instead of
# scientific / exponential notation.
options(scipen = 999)

# One statement per line: the fused `options(...)library(...)` and
# `library(ggthemes)library(GGally)` forms do not parse.
library(tidyverse)
library(ggthemes)
library(GGally)
library(caret)
library(tidymodels)
# library(data.table)
library(DT)
#' Bright, white-background variant of `ggthemes::theme_fivethirtyeight()`
#'
#' Starts from the FiveThirtyEight theme and replaces selected elements:
#' axis titles are switched back on, panel and plot backgrounds are white,
#' facet strips have no background, legend boxes are transparent, and the
#' title and subtitle are centred.
#'
#' @return A ggplot2 theme object, suitable for `theme_set()` or `+`.
theme_viny_bright <- function() {
  # No `library(ggthemes)` here: attaching a package from inside a function is
  # a hidden side effect, and the namespace-qualified call below is enough.
  # `%+replace%` swaps each listed element wholesale rather than merging it
  # with the base theme's version of that element.
  ggthemes::theme_fivethirtyeight() %+replace%
    theme(
      axis.title = element_text(),
      axis.text = element_text(size = 13),
      legend.text = element_text(size = 10),
      panel.background = element_rect(fill = "white"),
      plot.background = element_rect(fill = "white"),
      strip.background = element_blank(),
      legend.background = element_rect(fill = NA),
      legend.key = element_rect(fill = NA),
      plot.title = element_text(
        hjust = 0.5,
        size = 19,
        face = "bold"
      ),
      plot.subtitle = element_text(hjust = 0.5, colour = "maroon")
    )
}
theme_set(theme_viny_bright())dt1 <- read.csv("../../../../2.AnalytixLabs Practice/Home Credit - Credit Default Risk/Data/application_train.csv")
test <- read.csv("../../../../2.AnalytixLabs Practice/Home Credit - Credit Default Risk/Data/application_test.csv")dt1 %>% head()str(dt1)'data.frame': 307511 obs. of 122 variables:
$ SK_ID_CURR : int 100002 100003 100004 100006 100007 100008 100009 100010 100011 100012 ...
$ TARGET : int 1 0 0 0 0 0 0 0 0 0 ...
$ NAME_CONTRACT_TYPE : chr "Cash loans" "Cash loans" "Revolving loans" "Cash loans" ...
$ CODE_GENDER : chr "M" "F" "M" "F" ...
$ FLAG_OWN_CAR : chr "N" "N" "Y" "N" ...
$ FLAG_OWN_REALTY : chr "Y" "N" "Y" "Y" ...
$ CNT_CHILDREN : int 0 0 0 0 0 0 1 0 0 0 ...
$ AMT_INCOME_TOTAL : num 202500 270000 67500 135000 121500 ...
$ AMT_CREDIT : num 406598 1293503 135000 312683 513000 ...
$ AMT_ANNUITY : num 24701 35699 6750 29687 21866 ...
$ AMT_GOODS_PRICE : num 351000 1129500 135000 297000 513000 ...
$ NAME_TYPE_SUITE : chr "Unaccompanied" "Family" "Unaccompanied" "Unaccompanied" ...
$ NAME_INCOME_TYPE : chr "Working" "State servant" "Working" "Working" ...
$ NAME_EDUCATION_TYPE : chr "Secondary / secondary special" "Higher education" "Secondary / secondary special" "Secondary / secondary special" ...
$ NAME_FAMILY_STATUS : chr "Single / not married" "Married" "Single / not married" "Civil marriage" ...
$ NAME_HOUSING_TYPE : chr "House / apartment" "House / apartment" "House / apartment" "House / apartment" ...
$ REGION_POPULATION_RELATIVE : num 0.0188 0.00354 0.01003 0.00802 0.02866 ...
$ DAYS_BIRTH : int -9461 -16765 -19046 -19005 -19932 -16941 -13778 -18850 -20099 -14469 ...
$ DAYS_EMPLOYED : int -637 -1188 -225 -3039 -3038 -1588 -3130 -449 365243 -2019 ...
$ DAYS_REGISTRATION : num -3648 -1186 -4260 -9833 -4311 ...
$ DAYS_ID_PUBLISH : int -2120 -291 -2531 -2437 -3458 -477 -619 -2379 -3514 -3992 ...
$ OWN_CAR_AGE : num NA NA 26 NA NA NA 17 8 NA NA ...
$ FLAG_MOBIL : int 1 1 1 1 1 1 1 1 1 1 ...
$ FLAG_EMP_PHONE : int 1 1 1 1 1 1 1 1 0 1 ...
$ FLAG_WORK_PHONE : int 0 0 1 0 0 1 0 1 0 0 ...
$ FLAG_CONT_MOBILE : int 1 1 1 1 1 1 1 1 1 1 ...
$ FLAG_PHONE : int 1 1 1 0 0 1 1 0 0 0 ...
$ FLAG_EMAIL : int 0 0 0 0 0 0 0 0 0 0 ...
$ OCCUPATION_TYPE : chr "Laborers" "Core staff" "Laborers" "Laborers" ...
$ CNT_FAM_MEMBERS : num 1 2 1 2 1 2 3 2 2 1 ...
$ REGION_RATING_CLIENT : int 2 1 2 2 2 2 2 3 2 2 ...
$ REGION_RATING_CLIENT_W_CITY : int 2 1 2 2 2 2 2 3 2 2 ...
$ WEEKDAY_APPR_PROCESS_START : chr "WEDNESDAY" "MONDAY" "MONDAY" "WEDNESDAY" ...
$ HOUR_APPR_PROCESS_START : int 10 11 9 17 11 16 16 16 14 8 ...
$ REG_REGION_NOT_LIVE_REGION : int 0 0 0 0 0 0 0 0 0 0 ...
$ REG_REGION_NOT_WORK_REGION : int 0 0 0 0 0 0 0 0 0 0 ...
$ LIVE_REGION_NOT_WORK_REGION : int 0 0 0 0 0 0 0 0 0 0 ...
$ REG_CITY_NOT_LIVE_CITY : int 0 0 0 0 0 0 0 0 0 0 ...
$ REG_CITY_NOT_WORK_CITY : int 0 0 0 0 1 0 0 1 0 0 ...
$ LIVE_CITY_NOT_WORK_CITY : int 0 0 0 0 1 0 0 1 0 0 ...
$ ORGANIZATION_TYPE : chr "Business Entity Type 3" "School" "Government" "Business Entity Type 3" ...
$ EXT_SOURCE_1 : num 0.083 0.311 NA NA NA ...
$ EXT_SOURCE_2 : num 0.263 0.622 0.556 0.65 0.323 ...
$ EXT_SOURCE_3 : num 0.139 NA 0.73 NA NA ...
$ APARTMENTS_AVG : num 0.0247 0.0959 NA NA NA NA NA NA NA NA ...
$ BASEMENTAREA_AVG : num 0.0369 0.0529 NA NA NA NA NA NA NA NA ...
$ YEARS_BEGINEXPLUATATION_AVG : num 0.972 0.985 NA NA NA ...
$ YEARS_BUILD_AVG : num 0.619 0.796 NA NA NA ...
$ COMMONAREA_AVG : num 0.0143 0.0605 NA NA NA NA NA NA NA NA ...
$ ELEVATORS_AVG : num 0 0.08 NA NA NA NA NA NA NA NA ...
$ ENTRANCES_AVG : num 0.069 0.0345 NA NA NA NA NA NA NA NA ...
$ FLOORSMAX_AVG : num 0.0833 0.2917 NA NA NA ...
$ FLOORSMIN_AVG : num 0.125 0.333 NA NA NA ...
$ LANDAREA_AVG : num 0.0369 0.013 NA NA NA NA NA NA NA NA ...
$ LIVINGAPARTMENTS_AVG : num 0.0202 0.0773 NA NA NA NA NA NA NA NA ...
$ LIVINGAREA_AVG : num 0.019 0.0549 NA NA NA NA NA NA NA NA ...
$ NONLIVINGAPARTMENTS_AVG : num 0 0.0039 NA NA NA NA NA NA NA NA ...
$ NONLIVINGAREA_AVG : num 0 0.0098 NA NA NA NA NA NA NA NA ...
$ APARTMENTS_MODE : num 0.0252 0.0924 NA NA NA NA NA NA NA NA ...
$ BASEMENTAREA_MODE : num 0.0383 0.0538 NA NA NA NA NA NA NA NA ...
$ YEARS_BEGINEXPLUATATION_MODE: num 0.972 0.985 NA NA NA ...
$ YEARS_BUILD_MODE : num 0.634 0.804 NA NA NA ...
$ COMMONAREA_MODE : num 0.0144 0.0497 NA NA NA NA NA NA NA NA ...
$ ELEVATORS_MODE : num 0 0.0806 NA NA NA NA NA NA NA NA ...
$ ENTRANCES_MODE : num 0.069 0.0345 NA NA NA NA NA NA NA NA ...
$ FLOORSMAX_MODE : num 0.0833 0.2917 NA NA NA ...
$ FLOORSMIN_MODE : num 0.125 0.333 NA NA NA ...
$ LANDAREA_MODE : num 0.0377 0.0128 NA NA NA NA NA NA NA NA ...
$ LIVINGAPARTMENTS_MODE : num 0.022 0.079 NA NA NA NA NA NA NA NA ...
$ LIVINGAREA_MODE : num 0.0198 0.0554 NA NA NA NA NA NA NA NA ...
$ NONLIVINGAPARTMENTS_MODE : num 0 0 NA NA NA NA NA NA NA NA ...
$ NONLIVINGAREA_MODE : num 0 0 NA NA NA NA NA NA NA NA ...
$ APARTMENTS_MEDI : num 0.025 0.0968 NA NA NA NA NA NA NA NA ...
$ BASEMENTAREA_MEDI : num 0.0369 0.0529 NA NA NA NA NA NA NA NA ...
$ YEARS_BEGINEXPLUATATION_MEDI: num 0.972 0.985 NA NA NA ...
$ YEARS_BUILD_MEDI : num 0.624 0.799 NA NA NA ...
$ COMMONAREA_MEDI : num 0.0144 0.0608 NA NA NA NA NA NA NA NA ...
$ ELEVATORS_MEDI : num 0 0.08 NA NA NA NA NA NA NA NA ...
$ ENTRANCES_MEDI : num 0.069 0.0345 NA NA NA NA NA NA NA NA ...
$ FLOORSMAX_MEDI : num 0.0833 0.2917 NA NA NA ...
$ FLOORSMIN_MEDI : num 0.125 0.333 NA NA NA ...
$ LANDAREA_MEDI : num 0.0375 0.0132 NA NA NA NA NA NA NA NA ...
$ LIVINGAPARTMENTS_MEDI : num 0.0205 0.0787 NA NA NA NA NA NA NA NA ...
$ LIVINGAREA_MEDI : num 0.0193 0.0558 NA NA NA NA NA NA NA NA ...
$ NONLIVINGAPARTMENTS_MEDI : num 0 0.0039 NA NA NA NA NA NA NA NA ...
$ NONLIVINGAREA_MEDI : num 0 0.01 NA NA NA NA NA NA NA NA ...
$ FONDKAPREMONT_MODE : chr "reg oper account" "reg oper account" "" "" ...
$ HOUSETYPE_MODE : chr "block of flats" "block of flats" "" "" ...
$ TOTALAREA_MODE : num 0.0149 0.0714 NA NA NA NA NA NA NA NA ...
$ WALLSMATERIAL_MODE : chr "Stone, brick" "Block" "" "" ...
$ EMERGENCYSTATE_MODE : chr "No" "No" "" "" ...
$ OBS_30_CNT_SOCIAL_CIRCLE : num 2 1 0 2 0 0 1 2 1 2 ...
$ DEF_30_CNT_SOCIAL_CIRCLE : num 2 0 0 0 0 0 0 0 0 0 ...
$ OBS_60_CNT_SOCIAL_CIRCLE : num 2 1 0 2 0 0 1 2 1 2 ...
$ DEF_60_CNT_SOCIAL_CIRCLE : num 2 0 0 0 0 0 0 0 0 0 ...
$ DAYS_LAST_PHONE_CHANGE : num -1134 -828 -815 -617 -1106 ...
$ FLAG_DOCUMENT_2 : int 0 0 0 0 0 0 0 0 0 0 ...
$ FLAG_DOCUMENT_3 : int 1 1 0 1 0 1 0 1 1 0 ...
$ FLAG_DOCUMENT_4 : int 0 0 0 0 0 0 0 0 0 0 ...
[list output truncated]
# colSums(dt1, is.na)
# this gives errorna_count <- colSums(is.na(dt1))
na_count SK_ID_CURR TARGET NAME_CONTRACT_TYPE
0 0 0
CODE_GENDER FLAG_OWN_CAR FLAG_OWN_REALTY
0 0 0
CNT_CHILDREN AMT_INCOME_TOTAL AMT_CREDIT
0 0 0
AMT_ANNUITY AMT_GOODS_PRICE NAME_TYPE_SUITE
12 278 0
NAME_INCOME_TYPE NAME_EDUCATION_TYPE NAME_FAMILY_STATUS
0 0 0
NAME_HOUSING_TYPE REGION_POPULATION_RELATIVE DAYS_BIRTH
0 0 0
DAYS_EMPLOYED DAYS_REGISTRATION DAYS_ID_PUBLISH
0 0 0
OWN_CAR_AGE FLAG_MOBIL FLAG_EMP_PHONE
202929 0 0
FLAG_WORK_PHONE FLAG_CONT_MOBILE FLAG_PHONE
0 0 0
FLAG_EMAIL OCCUPATION_TYPE CNT_FAM_MEMBERS
0 0 2
REGION_RATING_CLIENT REGION_RATING_CLIENT_W_CITY WEEKDAY_APPR_PROCESS_START
0 0 0
HOUR_APPR_PROCESS_START REG_REGION_NOT_LIVE_REGION REG_REGION_NOT_WORK_REGION
0 0 0
LIVE_REGION_NOT_WORK_REGION REG_CITY_NOT_LIVE_CITY REG_CITY_NOT_WORK_CITY
0 0 0
LIVE_CITY_NOT_WORK_CITY ORGANIZATION_TYPE EXT_SOURCE_1
0 0 173378
EXT_SOURCE_2 EXT_SOURCE_3 APARTMENTS_AVG
660 60965 156061
BASEMENTAREA_AVG YEARS_BEGINEXPLUATATION_AVG YEARS_BUILD_AVG
179943 150007 204488
COMMONAREA_AVG ELEVATORS_AVG ENTRANCES_AVG
214865 163891 154828
FLOORSMAX_AVG FLOORSMIN_AVG LANDAREA_AVG
153020 208642 182590
LIVINGAPARTMENTS_AVG LIVINGAREA_AVG NONLIVINGAPARTMENTS_AVG
210199 154350 213514
NONLIVINGAREA_AVG APARTMENTS_MODE BASEMENTAREA_MODE
169682 156061 179943
YEARS_BEGINEXPLUATATION_MODE YEARS_BUILD_MODE COMMONAREA_MODE
150007 204488 214865
ELEVATORS_MODE ENTRANCES_MODE FLOORSMAX_MODE
163891 154828 153020
FLOORSMIN_MODE LANDAREA_MODE LIVINGAPARTMENTS_MODE
208642 182590 210199
LIVINGAREA_MODE NONLIVINGAPARTMENTS_MODE NONLIVINGAREA_MODE
154350 213514 169682
APARTMENTS_MEDI BASEMENTAREA_MEDI YEARS_BEGINEXPLUATATION_MEDI
156061 179943 150007
YEARS_BUILD_MEDI COMMONAREA_MEDI ELEVATORS_MEDI
204488 214865 163891
ENTRANCES_MEDI FLOORSMAX_MEDI FLOORSMIN_MEDI
154828 153020 208642
LANDAREA_MEDI LIVINGAPARTMENTS_MEDI LIVINGAREA_MEDI
182590 210199 154350
NONLIVINGAPARTMENTS_MEDI NONLIVINGAREA_MEDI FONDKAPREMONT_MODE
213514 169682 0
HOUSETYPE_MODE TOTALAREA_MODE WALLSMATERIAL_MODE
0 148431 0
EMERGENCYSTATE_MODE OBS_30_CNT_SOCIAL_CIRCLE DEF_30_CNT_SOCIAL_CIRCLE
0 1021 1021
OBS_60_CNT_SOCIAL_CIRCLE DEF_60_CNT_SOCIAL_CIRCLE DAYS_LAST_PHONE_CHANGE
1021 1021 1
FLAG_DOCUMENT_2 FLAG_DOCUMENT_3 FLAG_DOCUMENT_4
0 0 0
FLAG_DOCUMENT_5 FLAG_DOCUMENT_6 FLAG_DOCUMENT_7
0 0 0
FLAG_DOCUMENT_8 FLAG_DOCUMENT_9 FLAG_DOCUMENT_10
0 0 0
FLAG_DOCUMENT_11 FLAG_DOCUMENT_12 FLAG_DOCUMENT_13
0 0 0
FLAG_DOCUMENT_14 FLAG_DOCUMENT_15 FLAG_DOCUMENT_16
0 0 0
FLAG_DOCUMENT_17 FLAG_DOCUMENT_18 FLAG_DOCUMENT_19
0 0 0
FLAG_DOCUMENT_20 FLAG_DOCUMENT_21 AMT_REQ_CREDIT_BUREAU_HOUR
0 0 41519
AMT_REQ_CREDIT_BUREAU_DAY AMT_REQ_CREDIT_BUREAU_WEEK AMT_REQ_CREDIT_BUREAU_MON
41519 41519 41519
AMT_REQ_CREDIT_BUREAU_QRT AMT_REQ_CREDIT_BUREAU_YEAR
41519 41519
na_count %>%
as.data.frame() %>%
arrange(desc(.))missing data percentage
round(na_count / dim(dt1)[1] * 100, digits = 2) SK_ID_CURR TARGET NAME_CONTRACT_TYPE
0.00 0.00 0.00
CODE_GENDER FLAG_OWN_CAR FLAG_OWN_REALTY
0.00 0.00 0.00
CNT_CHILDREN AMT_INCOME_TOTAL AMT_CREDIT
0.00 0.00 0.00
AMT_ANNUITY AMT_GOODS_PRICE NAME_TYPE_SUITE
0.00 0.09 0.00
NAME_INCOME_TYPE NAME_EDUCATION_TYPE NAME_FAMILY_STATUS
0.00 0.00 0.00
NAME_HOUSING_TYPE REGION_POPULATION_RELATIVE DAYS_BIRTH
0.00 0.00 0.00
DAYS_EMPLOYED DAYS_REGISTRATION DAYS_ID_PUBLISH
0.00 0.00 0.00
OWN_CAR_AGE FLAG_MOBIL FLAG_EMP_PHONE
65.99 0.00 0.00
FLAG_WORK_PHONE FLAG_CONT_MOBILE FLAG_PHONE
0.00 0.00 0.00
FLAG_EMAIL OCCUPATION_TYPE CNT_FAM_MEMBERS
0.00 0.00 0.00
REGION_RATING_CLIENT REGION_RATING_CLIENT_W_CITY WEEKDAY_APPR_PROCESS_START
0.00 0.00 0.00
HOUR_APPR_PROCESS_START REG_REGION_NOT_LIVE_REGION REG_REGION_NOT_WORK_REGION
0.00 0.00 0.00
LIVE_REGION_NOT_WORK_REGION REG_CITY_NOT_LIVE_CITY REG_CITY_NOT_WORK_CITY
0.00 0.00 0.00
LIVE_CITY_NOT_WORK_CITY ORGANIZATION_TYPE EXT_SOURCE_1
0.00 0.00 56.38
EXT_SOURCE_2 EXT_SOURCE_3 APARTMENTS_AVG
0.21 19.83 50.75
BASEMENTAREA_AVG YEARS_BEGINEXPLUATATION_AVG YEARS_BUILD_AVG
58.52 48.78 66.50
COMMONAREA_AVG ELEVATORS_AVG ENTRANCES_AVG
69.87 53.30 50.35
FLOORSMAX_AVG FLOORSMIN_AVG LANDAREA_AVG
49.76 67.85 59.38
LIVINGAPARTMENTS_AVG LIVINGAREA_AVG NONLIVINGAPARTMENTS_AVG
68.35 50.19 69.43
NONLIVINGAREA_AVG APARTMENTS_MODE BASEMENTAREA_MODE
55.18 50.75 58.52
YEARS_BEGINEXPLUATATION_MODE YEARS_BUILD_MODE COMMONAREA_MODE
48.78 66.50 69.87
ELEVATORS_MODE ENTRANCES_MODE FLOORSMAX_MODE
53.30 50.35 49.76
FLOORSMIN_MODE LANDAREA_MODE LIVINGAPARTMENTS_MODE
67.85 59.38 68.35
LIVINGAREA_MODE NONLIVINGAPARTMENTS_MODE NONLIVINGAREA_MODE
50.19 69.43 55.18
APARTMENTS_MEDI BASEMENTAREA_MEDI YEARS_BEGINEXPLUATATION_MEDI
50.75 58.52 48.78
YEARS_BUILD_MEDI COMMONAREA_MEDI ELEVATORS_MEDI
66.50 69.87 53.30
ENTRANCES_MEDI FLOORSMAX_MEDI FLOORSMIN_MEDI
50.35 49.76 67.85
LANDAREA_MEDI LIVINGAPARTMENTS_MEDI LIVINGAREA_MEDI
59.38 68.35 50.19
NONLIVINGAPARTMENTS_MEDI NONLIVINGAREA_MEDI FONDKAPREMONT_MODE
69.43 55.18 0.00
HOUSETYPE_MODE TOTALAREA_MODE WALLSMATERIAL_MODE
0.00 48.27 0.00
EMERGENCYSTATE_MODE OBS_30_CNT_SOCIAL_CIRCLE DEF_30_CNT_SOCIAL_CIRCLE
0.00 0.33 0.33
OBS_60_CNT_SOCIAL_CIRCLE DEF_60_CNT_SOCIAL_CIRCLE DAYS_LAST_PHONE_CHANGE
0.33 0.33 0.00
FLAG_DOCUMENT_2 FLAG_DOCUMENT_3 FLAG_DOCUMENT_4
0.00 0.00 0.00
FLAG_DOCUMENT_5 FLAG_DOCUMENT_6 FLAG_DOCUMENT_7
0.00 0.00 0.00
FLAG_DOCUMENT_8 FLAG_DOCUMENT_9 FLAG_DOCUMENT_10
0.00 0.00 0.00
FLAG_DOCUMENT_11 FLAG_DOCUMENT_12 FLAG_DOCUMENT_13
0.00 0.00 0.00
FLAG_DOCUMENT_14 FLAG_DOCUMENT_15 FLAG_DOCUMENT_16
0.00 0.00 0.00
FLAG_DOCUMENT_17 FLAG_DOCUMENT_18 FLAG_DOCUMENT_19
0.00 0.00 0.00
FLAG_DOCUMENT_20 FLAG_DOCUMENT_21 AMT_REQ_CREDIT_BUREAU_HOUR
0.00 0.00 13.50
AMT_REQ_CREDIT_BUREAU_DAY AMT_REQ_CREDIT_BUREAU_WEEK AMT_REQ_CREDIT_BUREAU_MON
13.50 13.50 13.50
AMT_REQ_CREDIT_BUREAU_QRT AMT_REQ_CREDIT_BUREAU_YEAR
13.50 13.50
round(na_count / dim(dt1)[1] * 100, digits = 2) %>%
as.data.frame() %>%
arrange(desc(.))mv_data <- round(na_count / dim(dt1)[1] * 100, digits = 2) %>%
as.data.frame() %>%
arrange(desc(.)) %>%
rownames_to_column(var = "var_names") %>%
mutate(var_names = as.factor(var_names)) %>%
rename(missing_values = ".")
mv_data
mv_data %>% head(n = 40) %>%
ggplot(aes(x = missing_values/100, y = reorder(var_names, missing_values))) +
geom_col() +
theme_classic() +
ylab("column names") +
scale_x_continuous(labels = scales::percent)
# since we are using scales::percent so had to divide missing values by 100 summarise_all(dt1, funs(n_distinct, typeof))
# this results in double the columns (i.e. 244) instead of double the rows
As per Kagglers, the value 365243 in DAYS_EMPLOYED needs to be marked as a missing value
dt1$DAYS_EMPLOYED[dt1$DAYS_EMPLOYED == 365243] <- NAsum(is.na(dt1$DAYS_EMPLOYED))[1] 55374
sapply(dt1, FUN = function(x) all(x <= 0, na.rm = TRUE)) SK_ID_CURR TARGET NAME_CONTRACT_TYPE
FALSE FALSE FALSE
CODE_GENDER FLAG_OWN_CAR FLAG_OWN_REALTY
FALSE FALSE FALSE
CNT_CHILDREN AMT_INCOME_TOTAL AMT_CREDIT
FALSE FALSE FALSE
AMT_ANNUITY AMT_GOODS_PRICE NAME_TYPE_SUITE
FALSE FALSE FALSE
NAME_INCOME_TYPE NAME_EDUCATION_TYPE NAME_FAMILY_STATUS
FALSE FALSE FALSE
NAME_HOUSING_TYPE REGION_POPULATION_RELATIVE DAYS_BIRTH
FALSE FALSE TRUE
DAYS_EMPLOYED DAYS_REGISTRATION DAYS_ID_PUBLISH
TRUE TRUE TRUE
OWN_CAR_AGE FLAG_MOBIL FLAG_EMP_PHONE
FALSE FALSE FALSE
FLAG_WORK_PHONE FLAG_CONT_MOBILE FLAG_PHONE
FALSE FALSE FALSE
FLAG_EMAIL OCCUPATION_TYPE CNT_FAM_MEMBERS
FALSE FALSE FALSE
REGION_RATING_CLIENT REGION_RATING_CLIENT_W_CITY WEEKDAY_APPR_PROCESS_START
FALSE FALSE FALSE
HOUR_APPR_PROCESS_START REG_REGION_NOT_LIVE_REGION REG_REGION_NOT_WORK_REGION
FALSE FALSE FALSE
LIVE_REGION_NOT_WORK_REGION REG_CITY_NOT_LIVE_CITY REG_CITY_NOT_WORK_CITY
FALSE FALSE FALSE
LIVE_CITY_NOT_WORK_CITY ORGANIZATION_TYPE EXT_SOURCE_1
FALSE FALSE FALSE
EXT_SOURCE_2 EXT_SOURCE_3 APARTMENTS_AVG
FALSE FALSE FALSE
BASEMENTAREA_AVG YEARS_BEGINEXPLUATATION_AVG YEARS_BUILD_AVG
FALSE FALSE FALSE
COMMONAREA_AVG ELEVATORS_AVG ENTRANCES_AVG
FALSE FALSE FALSE
FLOORSMAX_AVG FLOORSMIN_AVG LANDAREA_AVG
FALSE FALSE FALSE
LIVINGAPARTMENTS_AVG LIVINGAREA_AVG NONLIVINGAPARTMENTS_AVG
FALSE FALSE FALSE
NONLIVINGAREA_AVG APARTMENTS_MODE BASEMENTAREA_MODE
FALSE FALSE FALSE
YEARS_BEGINEXPLUATATION_MODE YEARS_BUILD_MODE COMMONAREA_MODE
FALSE FALSE FALSE
ELEVATORS_MODE ENTRANCES_MODE FLOORSMAX_MODE
FALSE FALSE FALSE
FLOORSMIN_MODE LANDAREA_MODE LIVINGAPARTMENTS_MODE
FALSE FALSE FALSE
LIVINGAREA_MODE NONLIVINGAPARTMENTS_MODE NONLIVINGAREA_MODE
FALSE FALSE FALSE
APARTMENTS_MEDI BASEMENTAREA_MEDI YEARS_BEGINEXPLUATATION_MEDI
FALSE FALSE FALSE
YEARS_BUILD_MEDI COMMONAREA_MEDI ELEVATORS_MEDI
FALSE FALSE FALSE
ENTRANCES_MEDI FLOORSMAX_MEDI FLOORSMIN_MEDI
FALSE FALSE FALSE
LANDAREA_MEDI LIVINGAPARTMENTS_MEDI LIVINGAREA_MEDI
FALSE FALSE FALSE
NONLIVINGAPARTMENTS_MEDI NONLIVINGAREA_MEDI FONDKAPREMONT_MODE
FALSE FALSE FALSE
HOUSETYPE_MODE TOTALAREA_MODE WALLSMATERIAL_MODE
FALSE FALSE FALSE
EMERGENCYSTATE_MODE OBS_30_CNT_SOCIAL_CIRCLE DEF_30_CNT_SOCIAL_CIRCLE
FALSE FALSE FALSE
OBS_60_CNT_SOCIAL_CIRCLE DEF_60_CNT_SOCIAL_CIRCLE DAYS_LAST_PHONE_CHANGE
FALSE FALSE TRUE
FLAG_DOCUMENT_2 FLAG_DOCUMENT_3 FLAG_DOCUMENT_4
FALSE FALSE FALSE
FLAG_DOCUMENT_5 FLAG_DOCUMENT_6 FLAG_DOCUMENT_7
FALSE FALSE FALSE
FLAG_DOCUMENT_8 FLAG_DOCUMENT_9 FLAG_DOCUMENT_10
FALSE FALSE FALSE
FLAG_DOCUMENT_11 FLAG_DOCUMENT_12 FLAG_DOCUMENT_13
FALSE FALSE FALSE
FLAG_DOCUMENT_14 FLAG_DOCUMENT_15 FLAG_DOCUMENT_16
FALSE FALSE FALSE
FLAG_DOCUMENT_17 FLAG_DOCUMENT_18 FLAG_DOCUMENT_19
FALSE FALSE FALSE
FLAG_DOCUMENT_20 FLAG_DOCUMENT_21 AMT_REQ_CREDIT_BUREAU_HOUR
FALSE FALSE FALSE
AMT_REQ_CREDIT_BUREAU_DAY AMT_REQ_CREDIT_BUREAU_WEEK AMT_REQ_CREDIT_BUREAU_MON
FALSE FALSE FALSE
AMT_REQ_CREDIT_BUREAU_QRT AMT_REQ_CREDIT_BUREAU_YEAR
FALSE FALSE
dt1[,sapply(dt1, FUN = function(x) all(x <= 0, na.rm = TRUE))] dt1[,sapply(dt1, FUN = function(x) all(x <= 0, na.rm = TRUE)) == TRUE] # dt1[,lapply(dt1, FUN = function(x) all(x <= 0, na.rm = TRUE))]
# this gives errordt1[,lapply(dt1, FUN = function(x) all(x <= 0, na.rm = TRUE)) == TRUE] since data.table is altering the basic r functionality so adding with = FALSE to get it done
answered on stackoverflow: https://stackoverflow.com/questions/65089990/how-to-filter-true-values-from-logical-results-of-sapply/65090049
# dt1[,sapply(dt1, FUN = function(x) all(x <= 0, na.rm = TRUE)), with = FALSE]
# this returns error as data.table lib is removedso removing data.table lib from here onwards
# detach("package:data.table", unload = TRUE)# dt1 %>% select(across(everything(), ~ is.numeric(.x)))
# this gives error# dt1 %>% select(across(everything(), is.numeric))
# this gives errordt1 %>% select_if(is.numeric)from: https://youtu.be/OeHz3bkOo5c?t=283
dt1 %>%
mutate(row_key = row_number()) %>% head()dt1 %>%
mutate(row_key = row_number()) %>%
gather(key = "key", value = "value", -row_key) %>%
head()dt1 %>%
mutate(row_key = row_number()) %>%
gather(key = "key", value = "value", -row_key) %>%
filter(value %>% is.na()) %>%
count(row_key, sort = TRUE)summarise_all(dt1, n_distinct) summarise_all(dt1, n_distinct) %>%
pivot_longer(., cols = everything(),
names_to = "var_names",
values_to = "unique_count") %>%
arrange(desc(unique_count) )summarise_all(dt1, n_distinct) %>%
pivot_longer(., cols = everything(),
names_to = "var_names",
values_to = "unique_count") %>%
arrange(unique_count) summarise_all(dt1, n_distinct) %>%
pivot_longer(., cols = everything(),
names_to = "var_names",
values_to = "unique_count") %>%
filter(unique_count < 100) %>%
ggplot(aes(y =reorder(var_names, desc(unique_count)), x = unique_count)) +
geom_point() +
theme_light()dt1 %>% select(TARGET) %>%
ggplot(aes(x = TARGET)) +
geom_histogram(bins = 100, fill="#0072B2", alpha = .9) +
xlab("TARGET") +
theme_light() +
theme(axis.text.x = element_text(angle = 90, hjust =1))dt1 %>%
select(1:10) %>%
select_if(is.numeric) %>% head()dt1 %>%
select(1:6) %>%
select_if(is.numeric) %>%
imap(function(feature_value, feature_name){
df <- data.frame(x = feature_value)
names(df) <- feature_name
df %>%
ggplot(aes(x = feature_value)) +
geom_histogram(bins = 100, fill="#0072B2", alpha = .9) +
xlab(feature_name) +
theme_light() +
theme(axis.text.x = element_text(angle = 90, hjust =1))
})$SK_ID_CURR
$TARGET
#' Histogram of a single column, selected by position
#'
#' @param data A data frame; column `i` is plotted.
#' @param i Integer position of the column. It is also used to look up the
#'   column name for the plot title.
#' @return A ggplot object with a 100-bin histogram of `data[[i]]`.
func_plot_hist <- function(data, i) {
  df <- data.frame(x = data[[i]])
  ggplot(data = df, aes(x = x)) +
    geom_histogram(bins = 100, fill = "#0072B2", alpha = .9) +
    labs(
      title = paste0(i, "- ", colnames(data)[[i]]),
      x = NULL
    ) +
    theme_classic() +
    theme(axis.text.x = element_text(angle = 90, hjust = 1))
}

#' Build one plot per column index and lay them out on a grid
#'
#' @param data A data frame handed unchanged to `func`.
#' @param func A plotting function called as `func(data = data, i = i)`.
#' @param ii Vector of column indices; one plot is produced per element.
#' @param ncol Number of grid columns passed on to `grid.arrange`.
#' @return Whatever `grid.arrange` returns for the collected plots.
func_doPlots <- function(data, func, ii, ncol = 3) {
  # Build the list in one pass instead of growing it with c() on every
  # iteration; unname() keeps the plots positional even if `ii` is named.
  plots <- unname(lapply(ii, function(i) func(data = data, i = i)))
  # `grid.arrange` is looked up by name at call time, so gridExtra must be
  # attached before this function is called.
  do.call("grid.arrange", c(plots, ncol = ncol))
}

library(grid)
library(gridExtra)
# load grid and gridExtra before calling func_doPlots: grid.arrange() comes from gridExtra, and the call fails without it
func_doPlots(dt1 %>% select_if(is.numeric), func = func_plot_hist, ii = 1:20)

func_doPlots(dt1 %>% select_if(is.numeric), func = func_plot_hist, ii = 21:40)

func_doPlots(dt1 %>% select_if(is.numeric), func = func_plot_hist, ii = 41:60)

func_doPlots(dt1 %>% select_if(is.numeric), func = func_plot_hist, ii = 61:80)

func_doPlots(dt1 %>% select_if(is.numeric), func = func_plot_hist, ii = 81:100)

func_doPlots(dt1 %>% select_if(is.numeric), func = func_plot_hist, ii = 101:106)

#' Density plot of a single column, split by the target class
#'
#' @param data A data frame; column `i` is plotted.
#' @param i Integer position of the column. It is also used to look up the
#'   column name for the plot title.
#' @param target Class label for every row of `data`. It defaults to the
#'   global `dt1$TARGET`, which is what the function previously read
#'   implicitly, so existing `func(data = data, i = i)` calls are unaffected.
#' @return A ggplot object with one density curve per level of `target`.
func_plot_dens <- function(data, i, target = dt1$TARGET) {
  # Fail loudly instead of letting data.frame() recycle a mismatched target.
  stopifnot(length(target) == length(data[[i]]))
  df <- data.frame(x = data[[i]], target = as.factor(target))
  ggplot(data = df, aes(x = x)) +
    geom_density(
      aes(group = target, color = target, fill = target),
      # A constant belongs outside aes(): inside it, 0.2 is mapped through the
      # alpha scale instead of being used as the opacity.
      alpha = 0.2
    ) +
    labs(
      title = paste0(i, "- ", colnames(data)[[i]]),
      x = NULL
    ) +
    theme_light() +
    theme(
      axis.text.x = element_text(angle = 90, hjust = 1),
      legend.position = "none",
      plot.title = element_text(size = 10)
    )
}

func_doPlots(data = dt1 %>% select_if(is.numeric), func = func_plot_dens, ii = 21:40)

func_doPlots(data = dt1 %>% select_if(is.numeric),
func = func_plot_dens,
ii = 101:ncol(dt1 %>% select_if(is.numeric)))ggplot(data = dt1, aes(y = AMT_CREDIT,
x = as.factor(TARGET),
color = as.factor(dt1$TARGET)
# fill = as.factor(dt1$TARGET)
)) +
geom_boxplot(aes(alpha = 0.2)) +
# geom_jitter(fill = as.factor(dt1$TARGET), alpha = 0.2) +
# labs(title = paste0(i,"- ", colnames(data)[[i]]),
# x = NULL) +
theme_light() +
theme(axis.text.x = element_text(angle = 90, hjust =1),
legend.position = "none",
plot.title = element_text(size = 10)
)

#' Box plot of a single column, one box per target class
#'
#' @param data A data frame; column `i` is plotted on the y axis.
#' @param i Integer position of the column. It is also used to look up the
#'   column name for the plot title.
#' @param target Class label for every row of `data`. It defaults to the
#'   global `dt1$TARGET`, which is what the function previously read
#'   implicitly, so existing `func(data = data, i = i)` calls are unaffected.
#' @return A ggplot object with one box per level of `target`.
func_plot_box <- function(data, i, target = dt1$TARGET) {
  # Fail loudly instead of letting data.frame() recycle a mismatched target.
  stopifnot(length(target) == length(data[[i]]))
  df <- data.frame(selected_var = data[[i]], target = as.factor(target))
  ggplot(data = df, aes(y = selected_var, x = target, fill = target)) +
    geom_boxplot() +
    labs(
      title = paste0(i, "- ", colnames(data)[[i]]),
      x = NULL
    ) +
    theme_light() +
    theme(
      axis.text.x = element_text(angle = 90, hjust = 1),
      legend.position = "none",
      plot.title = element_text(size = 10)
    )
}

func_doPlots(data = dt1 %>% select_if(is.numeric),
func = func_plot_box,
ii = 1:20)func_doPlots(data = dt1 %>% select_if(is.numeric),
func = func_plot_box,
ii = 21:40)func_doPlots(data = dt1 %>% select_if(is.numeric),
func = func_plot_box,
ii = 41:60)func_doPlots(data = dt1 %>% select_if(is.numeric),
func = func_plot_box,
ii = 61:80)func_doPlots(data = dt1 %>% select_if(is.numeric),
func = func_plot_box,
ii = 81:100)func_doPlots(data = dt1 %>% select_if(is.numeric),
func = func_plot_box,
ii = 101:ncol(dt1 %>% select_if(is.numeric)))library(ggeasy)dt1 %>%
select(TARGET, NAME_CONTRACT_TYPE, NAME_INCOME_TYPE, NAME_EDUCATION_TYPE) %>%
# NAME_FAMILY_STATUS, NAME_HOUSING_TYPE, TARGET) %>%
GGally::ggpairs(aes(color = TARGET)) +
theme_light() +
ggeasy::easy_rotate_x_labels(angle = 90) +
ggeasy::easy_all_text_size(size = 10)dt1 %>%
select(TARGET, NAME_CONTRACT_TYPE, DAYS_BIRTH, DAYS_EMPLOYED) %>%
# NAME_FAMILY_STATUS, NAME_HOUSING_TYPE, TARGET) %>%
GGally::ggpairs(aes(color = TARGET, alpha = 0.5)) +
theme_light() +
ggeasy::easy_rotate_x_labels(angle = 90) +
ggeasy::easy_all_text_size(size = 10)dt1 %>%
select(TARGET, NAME_CONTRACT_TYPE, NAME_FAMILY_STATUS, NAME_HOUSING_TYPE) %>%
# , ORGANIZATION_TYPE) %>%
GGally::ggpairs(aes(color = TARGET, alpha = 0.5)) +
theme_light() +
ggeasy::easy_rotate_x_labels(angle = 90) +
ggeasy::easy_all_text_size(size = 10)dt1 <- dt1 %>%
mutate(TARGET = as.factor(TARGET)) %>%
select(-SK_ID_CURR)rec <- recipe(TARGET ~ ., data = dt1)recData Recipe
Inputs:
rec_zv <- recipes::step_zv(rec, all_predictors())bake(prep(rec_zv), new_data = dt1)rec_nzv <- recipes::step_nzv(rec_zv, all_predictors())bake(prep(rec_nzv), new_data = dt1)rec_ranged <- recipes::step_range(rec_nzv, all_numeric())bake(prep(rec_ranged), new_data = dt1)rec_scaled <- recipes::step_scale(rec_ranged, all_numeric())bake(prep(rec_scaled), new_data = dt1)rec_trans_boxcox <- recipes::step_BoxCox(rec_scaled, all_numeric())bake(prep(rec_trans_boxcox), new_data = dt1)rec_dummy <- recipes::step_dummy(rec_trans_boxcox, all_nominal())
rec_dummyData Recipe
Inputs:
Operations:
Zero variance filter on all_predictors()
Sparse, unbalanced variable filter on all_predictors()
Range scaling to [0,1] for all_numeric()
Scaling for all_numeric()
Box-Cox transformation on all_numeric()
Dummy variables from all_nominal()
rec_pca <- recipes::step_pca(rec_dummy, all_predictors())
rec_pcaData Recipe
Inputs:
Operations:
Zero variance filter on all_predictors()
Sparse, unbalanced variable filter on all_predictors()
Range scaling to [0,1] for all_numeric()
Scaling for all_numeric()
Box-Cox transformation on all_numeric()
Dummy variables from all_nominal()
No PCA components were extracted.
rec_combined <- recipe(TARGET ~ ., data = dt1) %>%
step_zv(all_predictors()) %>%
step_nzv(all_predictors()) %>%
step_range(all_numeric()) %>%
step_scale(all_numeric()) %>%
step_BoxCox(all_numeric()) %>%
step_dummy(all_nominal())prepd_rec <- prep(rec_combined)
bake(prepd_rec, new_data = dt1)rec_combined_pca <- rec_combined %>%
step_pca(all_predictors())bake(prep(rec_combined_pca), new_data = dt1)Error in svd(x, nu = 0, nv = k) : infinite or missing values in 'x'
rec_combined2 <- rec_combined %>%
step_medianimpute(all_numeric()) %>%
step_modeimpute(all_nominal())
rec_combined2Data Recipe
Inputs:
Operations:
Zero variance filter on all_predictors()
Sparse, unbalanced variable filter on all_predictors()
Range scaling to [0,1] for all_numeric()
Scaling for all_numeric()
Box-Cox transformation on all_numeric()
Dummy variables from all_nominal()
Median Imputation for all_numeric()
Mode Imputation for all_nominal()
# PCA on the preprocessed predictors ----

# Dummy-encode the predictors only. The earlier `step_dummy(all_nominal())`
# also encoded the factor outcome, so `TARGET` was missing from the juiced
# data and the PC1/PC2 scatter at the end failed with
# "object 'TARGET' not found".
rec_combined_pca2 <- recipe(TARGET ~ ., data = dt1) %>%
  step_zv(all_predictors()) %>%
  step_nzv(all_predictors()) %>%
  step_range(all_numeric()) %>%
  step_scale(all_numeric()) %>%
  step_BoxCox(all_numeric()) %>%
  step_dummy(all_nominal(), -all_outcomes()) %>%
  step_medianimpute(all_numeric()) %>%
  step_modeimpute(all_nominal()) %>%
  step_pca(all_predictors())

# Prep once and reuse the result for both tidy() and juice().
prepd_pca2 <- prep(rec_combined_pca2)

# Step 9 of the recipe is step_pca(); tidy() returns its loadings.
tidied_pca <- tidy(prepd_pca2, 9)
tidied_pca

tidied_pca %>%
  filter(component %in% paste0("PC", 1:5)) %>%
  ggplot(aes(x = value, y = terms, fill = terms)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~component, nrow = 1) +
  labs(y = NULL) +
  theme_classic() +
  theme(axis.text = element_text(size = 7))

# The 8 largest absolute loadings for each of the first four components,
# computed once and reused by the three plots below.
pca_top_terms <- tidied_pca %>%
  filter(component %in% paste0("PC", 1:4)) %>%
  group_by(component) %>%
  top_n(8, abs(value)) %>%
  ungroup()
pca_top_terms

pca_top_terms %>%
  ggplot(aes(x = abs(value), y = terms, fill = value > 0)) +
  geom_col() +
  theme_light()

pca_top_terms %>%
  ggplot(aes(abs(value), terms, fill = value > 0)) +
  geom_col() +
  theme_light() +
  facet_wrap(~component, scales = "free_y")

library(tidytext)

# reorder_within() / scale_y_reordered() sort the terms inside each facet.
pca_top_terms %>%
  mutate(terms = reorder_within(terms, abs(value), component)) %>%
  ggplot(aes(abs(value), terms, fill = value > 0)) +
  geom_col() +
  theme_light() +
  scale_y_reordered() +
  facet_wrap(~component, scales = "free_y") +
  labs(y = NULL, fill = "Positive?")

# Scatter of the first two components coloured by the outcome. The earlier
# `label = name` mapping and its geom_text() layer are dropped because this
# data set has no `name` column.
juice(prepd_pca2) %>%
  ggplot(aes(PC1, PC2)) +
  geom_point(alpha = 0.7, size = 2, aes(color = TARGET)) +
  labs(color = NULL) +
  theme_classic()